{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_excel('data.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q1_性别</th>\n",
       "      <th>Q2_身高（厘米）</th>\n",
       "      <th>Q3_体重 （公斤）</th>\n",
       "      <th>Q4_头发长度（厘米）</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>男</td>\n",
       "      <td>190</td>\n",
       "      <td>70</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>女</td>\n",
       "      <td>160</td>\n",
       "      <td>45</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>男</td>\n",
       "      <td>179</td>\n",
       "      <td>61</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>女</td>\n",
       "      <td>173</td>\n",
       "      <td>60</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>男</td>\n",
       "      <td>175</td>\n",
       "      <td>70</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Q1_性别  Q2_身高（厘米）  Q3_体重 （公斤）  Q4_头发长度（厘米）\n",
       "0     男        190          70            7\n",
       "1     女        160          45           20\n",
       "2     男        179          61            5\n",
       "3     女        173          60           50\n",
       "4     男        175          70           15"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.rename(columns={'Q1_性别': 'label', \n",
    "                            'Q2_身高（厘米）': 'height', \n",
    "                            'Q3_体重 （公斤）': 'weight', \n",
    "                            'Q4_头发长度（厘米）': 'hair'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['label'] = data['label'].apply(lambda x : {'男': 0, '女': 1}[x])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>height</th>\n",
       "      <th>weight</th>\n",
       "      <th>hair</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>190</td>\n",
       "      <td>70</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>160</td>\n",
       "      <td>45</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>179</td>\n",
       "      <td>61</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>173</td>\n",
       "      <td>60</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>175</td>\n",
       "      <td>70</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label  height  weight  hair\n",
       "0      0     190      70     7\n",
       "1      1     160      45    20\n",
       "2      0     179      61     5\n",
       "3      1     173      60    50\n",
       "4      0     175      70    15"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = data[['height', 'weight', 'hair']].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean = np.mean(features, axis=0)\n",
    "std = np.std(features, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = (features - mean)/std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = data['label'].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 9.86969285e-01, -2.25614423e-03, -4.63916995e-01],\n",
       "       [-7.03854590e-01, -8.10707828e-01, -1.44545566e-01],\n",
       "       [ 3.67000531e-01, -2.93298750e-01, -5.13051061e-01],\n",
       "       [ 2.88357560e-02, -3.25636818e-01,  5.92465423e-01],\n",
       "       [ 1.41557348e-01, -2.25614423e-03, -2.67380731e-01],\n",
       "       [-5.91132998e-01, -5.52003289e-01,  1.01124764e-01],\n",
       "       [-1.40246631e-01,  3.21124529e-01, -5.13051061e-01],\n",
       "       [-2.52968223e-01, -4.87327154e-01, -5.13051061e-01],\n",
       "       [ 1.41557348e-01, -1.63946481e-01, -5.37618094e-01],\n",
       "       [ 1.41557348e-01, -2.25614423e-03, -5.37618094e-01],\n",
       "       [-8.16576182e-01, -7.46031693e-01,  5.92465423e-01],\n",
       "       [ 4.23361327e-01, -6.69322789e-02, -5.62185127e-01],\n",
       "       [ 4.23361327e-01,  1.59434192e-01, -5.13051061e-01],\n",
       "       [-7.03854590e-01, -6.81355558e-01, -1.44545566e-01],\n",
       "       [-3.09329019e-01, -3.25636818e-01, -3.41081830e-01],\n",
       "       [-7.03854590e-01, -6.81355558e-01,  3.46795093e-01],\n",
       "       [ 1.41557348e-01, -1.31608414e-01, -2.67380731e-01],\n",
       "       [ 8.74247694e-01, -2.25614423e-03, -2.67380731e-01],\n",
       "       [-8.38858357e-02, -2.28622616e-01, -3.90215896e-01],\n",
       "       [ 1.41557348e-01,  1.59434192e-01, -3.90215896e-01],\n",
       "       [-2.75250398e-02, -1.63946481e-01, -5.62185127e-01],\n",
       "       [-7.03854590e-01, -5.84341356e-01,  5.92465423e-01],\n",
       "       [ 4.79722123e-01,  1.91772260e-01, -3.41081830e-01],\n",
       "       [ 1.41557348e-01, -3.25636818e-01,  1.82081707e+00],\n",
       "       [-2.75250398e-02, -3.57974885e-01, -5.13051061e-01],\n",
       "       [ 2.88357560e-02, -2.93298750e-01, -5.62185127e-01],\n",
       "       [-1.40246631e-01, -3.25636818e-01, -4.63916995e-01],\n",
       "       [ 4.23361327e-01, -3.90312952e-01, -5.13051061e-01],\n",
       "       [-1.96607427e-01, -4.87327154e-01,  1.01124764e-01],\n",
       "       [ 4.23361327e-01, -1.96284548e-01,  5.92465423e-01],\n",
       "       [-7.03854590e-01, -6.49017491e-01,  3.46795093e-01],\n",
       "       [-2.75250398e-02, -2.60960683e-01, -5.62185127e-01],\n",
       "       [ 3.10639735e-01, -2.25614423e-03, -4.88484028e-01],\n",
       "       [ 3.10639735e-01, -2.25614423e-03, -3.90215896e-01],\n",
       "       [ 8.51965518e-02,  1.61464722e+00, -3.90215896e-01],\n",
       "       [ 4.31225624e+00,  4.16935454e+00,  3.73704564e+00],\n",
       "       [ 7.05165306e-01, -3.25636818e-01, -2.67380731e-01],\n",
       "       [-4.08550234e+00,  4.16935454e+00,  4.22838630e+00],\n",
       "       [-5.34772202e-01, -4.54989087e-01,  1.01124764e-01],\n",
       "       [-8.38858357e-02,  1.59434192e-01, -5.62185127e-01],\n",
       "       [ 2.88357560e-02, -1.63946481e-01, -5.13051061e-01],\n",
       "       [-1.96607427e-01, -5.19665222e-01, -9.54115001e-02],\n",
       "       [-2.75250398e-02,  6.24199904e-02, -5.62185127e-01]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,\n",
       "       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Picking a Link Function\n",
    "Generalized linear models usually tranform a linear model of the predictors by using a [link function](https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function). In logistic regression, the link function is the [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function). We can implement this really easily."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(scores):\n",
    "    return 1 / (1 + np.exp(-scores))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Maximizing the Likelihood"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To maximize the likelihood, I need a way to compute the likelihood and the gradient of the likelihood. Fortunately, the likelihood (for binary classification) can be reduced to a fairly intuitive form by switching to the log-likelihood. We're able to do this without affecting the weights parameter estimation because log transformation are [monotonic](https://en.wikipedia.org/wiki/Monotonic_function).\n",
    "\n",
    "For anyone interested in the derivations of the functions I'm using, check out Section 4.4.1 of Hastie, Tibsharani, and Friedman's [Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/). For those less mathematically inclined, Carlos Guestrin (Univesity of Washington) details one possible derivation of the log-likelihood in a series of short lectures on [Coursera](https://www.coursera.org/learn/ml-classification/lecture/1ZeTC/very-optional-expressing-the-log-likelihood) using indicator functions.\n",
    "\n",
    "## Calculating the Log-Likelihood\n",
    "\n",
    "The log-likelihood can be viewed as as sum over all the training data. Mathematically,\n",
    "\n",
    "$$\\begin{equation}\n",
    "ll = \\sum_{i=1}^{N}y_{i}\\beta ^{T}x_{i} - log(1+e^{\\beta^{T}x_{i}})\n",
    "\\end{equation}$$\n",
    "\n",
    "where $y$ is the target class, $x_{i}$ represents an individual data point, and $\\beta$ is the weights vector.\n",
    "\n",
    "I can easily turn that into a function and take advantage of matrix algebra."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def log_likelihood(features, target, weights):\n",
    "    scores = np.dot(features, weights)\n",
    "    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )\n",
    "    return ll"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating the Gradient\n",
    "\n",
    "Now I need an equation for the gradient of the log-likelihood. By taking the derivative of the equation above and reformulating in matrix form, the gradient becomes: \n",
    "\n",
    "$$\\begin{equation}\n",
    "\\bigtriangledown ll = X^{T}(Y - Predictions)\n",
    "\\end{equation}$$\n",
    "\n",
    "Again, this is really easy to implement. It's so simple I don't even need to wrap it into a function. The gradient here looks very similar to the output layer gradient in a neural network (see my [post](https://beckernick.github.io/neural-network-scratch/) on neural networks if you're curious).\n",
    "\n",
    "This shouldn't be too surprising, since a neural network is basically just a series of non-linear link functions applied after linear manipulations of the input data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building the Logistic Regression Function\n",
    "\n",
    "Finally, I'm ready to build the model function. I'll add in the option to calculate the model with an intercept, since it's a good option to have."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False):\n",
    "    if add_intercept:\n",
    "        intercept = np.ones((features.shape[0], 1))\n",
    "        features = np.hstack((intercept, features))\n",
    "        \n",
    "    weights = np.zeros(features.shape[1])\n",
    "    \n",
    "    for step in range(num_steps):\n",
    "        scores = np.dot(features, weights)\n",
    "        predictions = sigmoid(scores)\n",
    "\n",
    "        # Update weights with log likelihood gradient\n",
    "        output_error_signal = target - predictions\n",
    "        \n",
    "        gradient = np.dot(features.T, output_error_signal)\n",
    "        weights += learning_rate * gradient\n",
    "\n",
    "        # Print log-likelihood every so often\n",
    "        if step % 10000 == 0:\n",
    "            print(log_likelihood(features, target, weights))\n",
    "        \n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Time to do the regression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-29.794300659677482\n",
      "-12.17665666438134\n",
      "-10.368615637493027\n",
      "-9.699007253564737\n",
      "-9.355548233950786\n"
     ]
    }
   ],
   "source": [
    "weights = logistic_regression(features, label,\n",
    "                     num_steps = 50000, learning_rate = 5e-5, add_intercept=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-1.62788588 -3.1418227  -2.31358577  2.1596935 ]\n"
     ]
    }
   ],
   "source": [
    "print(weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(features, weights):\n",
    "    global mean\n",
    "    global std\n",
    "    features = (features - mean)/std\n",
    "    intercept = np.ones((features.shape[0], 1))\n",
    "    features = np.hstack((intercept, features))\n",
    "    scores = np.dot(features, weights)\n",
    "    predictions = sigmoid(scores)\n",
    "    \n",
    "    return predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00115921]\n"
     ]
    }
   ],
   "source": [
    "student1 = np.array([[188, 85, 2]])\n",
    "print(predict(student1, weights))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.76002054]\n"
     ]
    }
   ],
   "source": [
    "student2 = np.array([[165, 50, 25]])\n",
    "print(predict(student2, weights))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
